package com.system.weirdor.crawler.processor;

import com.system.weirdor.search.index.EsBlog;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.UUID;

/**
 * WebMagic {@link PageProcessor} for cnblogs.com.
 *
 * <p>Pages whose URL does not look like a blog post are treated as listing pages: the post links
 * found under {@code #post_list} are queued as further crawl targets. Pages whose URL looks like a
 * blog post have two text values extracted and published as result fields, both individually
 * ({@code "content"}, {@code "title"}) and wrapped in an {@link EsBlog} ({@code "eswb"}).
 *
 * <p>Created by Elon.su on 17/7/12.
 */
public class CnBlogFetchProcessor implements PageProcessor {
    /**
     * Matches post URLs of the form {@code http://www.cnblogs.com/<letters, digits, '-'>/p/<7 digits>.html}.
     * The dots are escaped so they only match a literal '.'.
     */
    private static final String POST_URL_REGEX =
            "http://www\\.cnblogs\\.com/[a-z0-9-]+/p/[0-9]{7}\\.html";

    /** Selects the post links on a listing page. */
    private static final String POST_LINKS_XPATH =
            "//*[@id=\"post_list\"]/div/div[@class='post_item_body']/h3/a/@href";

    /** Selects the text of the element with id {@code Header1_HeaderTitle}. */
    private static final String HEADER_TITLE_XPATH = "//*[@id=\"Header1_HeaderTitle\"]/text()";

    /** Selects the text of the {@code <a class="postTitle2">} anchor. */
    private static final String POST_TITLE_XPATH = "//a[@class='postTitle2']/text()";

    // Crawl configuration for the target site (encoding, crawl interval, retry count, ...);
    // here: 3 retries and a sleep time of 100.
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    // Number of post pages handled so far.
    // NOTE(review): incremented without synchronization and never read in this class. If the
    // spider runs this processor on several threads, increments may be lost — confirm whether
    // the counter is needed and, if so, consider an atomic counter.
    private static int count = 0;

    /**
     * Returns the crawl configuration used for this processor.
     *
     * @return the site settings built at construction time
     */
    public Site getSite() {
        return site;
    }

    /**
     * Handles one downloaded page.
     *
     * <p>If the page URL is not a post URL, the post links on the page are added as new crawl
     * targets and nothing else happens. Otherwise the header text and the post title are
     * extracted and stored in the page's result fields.
     *
     * @param page the downloaded page to inspect
     */
    public void process(Page page) {
        // Anything that is not a post page is treated as a listing: enqueue the posts it links to.
        if (!page.getUrl().regex(POST_URL_REGEX).match()) {
            page.addTargetRequests(page.getHtml().xpath(POST_LINKS_XPATH).all());
            return;
        }

        // Evaluate each selector once and reuse the result for both the raw fields and the
        // EsBlog. Either value is null when the selector matches nothing.
        // NOTE(review): "content" is taken from the Header1_HeaderTitle element, exactly as in
        // the original code — confirm this is the intended source for the post content.
        String content = page.getHtml().xpath(HEADER_TITLE_XPATH).get();
        String title = page.getHtml().xpath(POST_TITLE_XPATH).get();

        page.putField("content", content);
        page.putField("title", title);

        // NOTE(review): the original first assigned a random UUID to the title and then
        // immediately overwrote it with the extracted title. That dead assignment was removed;
        // if an identifier was intended, set it through the appropriate EsBlog setter.
        EsBlog eswb = new EsBlog();
        eswb.setContent(content);
        eswb.setTitle(title);
        page.putField("eswb", eswb);

        count++;
    }
}